/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifdef __aarch64__

#include "arm_gemm.hpp"
#include "quantized.hpp"
#include "utils.hpp"

#include <cassert>

namespace arm_gemm {

template<>
void row_sums_indirect(
    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
    size_t M, int32_t *out_ptr, const Requantize32 *qp
)
{
    struct KernelArgs {
        unsigned int num_strings;
        const unsigned int *string_lengths;
        unsigned int input_initial_col;
    } ka;

    unsigned long flags=0;
    void *input_ptr;
    size_t input_offset;

    if (A_arg.is_indirect) {
        input_ptr=(void *)(A_arg.indirect.ptr);
        input_offset=A_arg.indirect.start_row;
        ka.input_initial_col=A_arg.indirect.start_col;
        flags |= 0x8;
    } else {
        assert(num_strings==1);
        input_ptr=(void *)(A_arg.direct.base);
        input_offset=A_arg.direct.stride;
    }

    ka.num_strings = num_strings;
    ka.string_lengths = string_lengths;

    __asm__ __volatile__(
      "add x19, %x[qp], %[b_offset]\n"
      "ld1r { v2.4s }, [x19]\n"
      "neg v2.4s, v2.4s\n"
      "1:"  // Row loop
      "cmp %x[M], #0x6\n"
      "bge 86f\n"
      "cmp %x[M], #0x4\n"
      "bgt 69f\n"
      "beq 52f\n"
      "cmp %x[M], #0x2\n"
      "bgt 35f\n"
      "beq 18f\n"
      "movi v1.8h, #0x0\n"
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "movi v0.4s, #0x0\n"
      "mov x9, #0x0\n"
      "mov x28, #0x0\n"
      "2:"  // Height 1: String loop
      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr w27, [x19, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 3f\n"
      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x19, x19, %x[input_offset], LSL #3\n"
      "ldr x26, [x19, #0x0]\n"
      "cbnz x28, 4f\n"
      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x19\n"
      "b 4f\n"
      "3:"  // Height 1: setup direct input
      "mov x26, %x[input_ptr]\n"
      "4:"  // Height 1: input setup done
      "cmp x27, #0x10\n"
      "blt 8f\n"
      "cmp x27, #0x20\n"
      "blt 7f\n"
      "5:"  // Height 1: Multiply loop: Main loop head
      "ldr q31, [x26, #0x0]\n"
      "cmp x9, #0x7e\n"
      "add x26, x26, #0x10\n"
      "blt 6f\n"
      "uadalp v0.4s, v1.8h\n"
      "movi v1.8h, #0x0\n"
      "mov x9, #0x0\n"
      "6:"  // Height 1: Multiply loop: unique 1: no collapse
      "uadalp v1.8h, v31.16b\n"
      "add x9, x9, #0x1\n"
      "sub x27, x27, #0x10\n"
      "cmp x27, #0x20\n"
      "bge 5b\n"
      "7:"  // Height 1: Multiply loop: Single iteration only
      "sub x27, x27, #0x10\n"
      "ldr q31, [x26, #0x0]\n"
      "add x26, x26, #0x10\n"
      "uadalp v1.8h, v31.16b\n"
      "8:"  // Height 1: Multiply loop: Main loop skip
      "cbz x27, 17f\n"
      "tbz x27, #3, 12f\n"
      "ldr d31, [x26], #0x8\n"
      "tbz x27, #2, 10f\n"
      "ld1 { v31.s }[2], [x26], #0x4\n"
      "tbz x27, #1, 9f\n"
      "ld1 { v31.h }[6], [x26], #0x2\n"
      "tbz x27, #0, 16f\n"
      "ld1 { v31.b }[14], [x26]\n"
      "b 16f\n"
      "9:"  // Height 1: Multiply loop: Ragged operand read: partial_1_12
      "tbz x27, #0, 16f\n"
      "ld1 { v31.b }[12], [x26]\n"
      "b 16f\n"
      "10:"  // Height 1: Multiply loop: Ragged operand read: partial_2_8
      "tbz x27, #1, 11f\n"
      "ld1 { v31.h }[4], [x26], #0x2\n"
      "tbz x27, #0, 16f\n"
      "ld1 { v31.b }[10], [x26]\n"
      "b 16f\n"
      "11:"  // Height 1: Multiply loop: Ragged operand read: partial_1_8
      "tbz x27, #0, 16f\n"
      "ld1 { v31.b }[8], [x26]\n"
      "b 16f\n"
      "12:"  // Height 1: Multiply loop: Ragged operand read: partial_4_0
      "tbz x27, #2, 14f\n"
      "ldr s31, [x26], #0x4\n"
      "tbz x27, #1, 13f\n"
      "ld1 { v31.h }[2], [x26], #0x2\n"
      "tbz x27, #0, 16f\n"
      "ld1 { v31.b }[6], [x26]\n"
      "b 16f\n"
      "13:"  // Height 1: Multiply loop: Ragged operand read: partial_1_4
      "tbz x27, #0, 16f\n"
      "ld1 { v31.b }[4], [x26]\n"
      "b 16f\n"
      "14:"  // Height 1: Multiply loop: Ragged operand read: partial_2_0
      "tbz x27, #1, 15f\n"
      "ldr h31, [x26], #0x2\n"
      "tbz x27, #0, 16f\n"
      "ld1 { v31.b }[2], [x26]\n"
      "b 16f\n"
      "15:"  // Height 1: Multiply loop: Ragged operand read: partial_1_0
      "ldr b31, [x26, #0x0]\n"
      "16:"  // Height 1: Multiply loop: Ragged operand read: Done
      "uadalp v1.8h, v31.16b\n"
      "17:"  // Height 1: Multiply loop: No odd multiplies
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 2b\n"
      "uadalp v0.4s, v1.8h\n"
      "addp v0.4s, v0.4s, v0.4s\n"
      "addp v0.4s, v0.4s, v0.4s\n"
      "mul v0.4s, v0.4s, v2.4s\n"
      "str s0, [%x[out_ptr]], #0x4\n"
      "b 104f\n"
      "18:"  // Height 2
      "movi v1.8h, #0x0\n"
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "mov x9, #0x0\n"
      "movi v0.4s, #0x0\n"
      "mov x28, #0x0\n"
      "movi v30.8h, #0x0\n"
      "movi v29.4s, #0x0\n"
      "19:"  // Height 2: String loop
      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr w27, [x19, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 20f\n"
      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x19, x19, %x[input_offset], LSL #3\n"
      "ldr x26, [x19, #0x0]\n"
      "ldr x25, [x19, #0x8]\n"
      "cbnz x28, 21f\n"
      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x19\n"
      "add x25, x25, x19\n"
      "b 21f\n"
      "20:"  // Height 2: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, %x[input_offset]\n"
      "21:"  // Height 2: input setup done
      "cmp x27, #0x10\n"
      "blt 25f\n"
      "cmp x27, #0x20\n"
      "blt 24f\n"
      "22:"  // Height 2: Multiply loop: Main loop head
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "cmp x9, #0x7e\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "blt 23f\n"
      "uadalp v0.4s, v1.8h\n"
      "movi v1.8h, #0x0\n"
      "uadalp v29.4s, v30.8h\n"
      "movi v30.8h, #0x0\n"
      "mov x9, #0x0\n"
      "23:"  // Height 2: Multiply loop: unique 2: no collapse
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "add x9, x9, #0x1\n"
      "sub x27, x27, #0x10\n"
      "cmp x27, #0x20\n"
      "bge 22b\n"
      "24:"  // Height 2: Multiply loop: Single iteration only
      "sub x27, x27, #0x10\n"
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "25:"  // Height 2: Multiply loop: Main loop skip
      "cbz x27, 34f\n"
      "tbz x27, #3, 29f\n"
      "ldr d31, [x26], #0x8\n"
      "ldr d28, [x25], #0x8\n"
      "tbz x27, #2, 27f\n"
      "ld1 { v31.s }[2], [x26], #0x4\n"
      "ld1 { v28.s }[2], [x25], #0x4\n"
      "tbz x27, #1, 26f\n"
      "ld1 { v31.h }[6], [x26], #0x2\n"
      "ld1 { v28.h }[6], [x25], #0x2\n"
      "tbz x27, #0, 33f\n"
      "ld1 { v31.b }[14], [x26]\n"
      "ld1 { v28.b }[14], [x25]\n"
      "b 33f\n"
      "26:"  // Height 2: Multiply loop: Ragged operand read: partial_1_12
      "tbz x27, #0, 33f\n"
      "ld1 { v31.b }[12], [x26]\n"
      "ld1 { v28.b }[12], [x25]\n"
      "b 33f\n"
      "27:"  // Height 2: Multiply loop: Ragged operand read: partial_2_8
      "tbz x27, #1, 28f\n"
      "ld1 { v31.h }[4], [x26], #0x2\n"
      "ld1 { v28.h }[4], [x25], #0x2\n"
      "tbz x27, #0, 33f\n"
      "ld1 { v31.b }[10], [x26]\n"
      "ld1 { v28.b }[10], [x25]\n"
      "b 33f\n"
      "28:"  // Height 2: Multiply loop: Ragged operand read: partial_1_8
      "tbz x27, #0, 33f\n"
      "ld1 { v31.b }[8], [x26]\n"
      "ld1 { v28.b }[8], [x25]\n"
      "b 33f\n"
      "29:"  // Height 2: Multiply loop: Ragged operand read: partial_4_0
      "tbz x27, #2, 31f\n"
      "ldr s31, [x26], #0x4\n"
      "ldr s28, [x25], #0x4\n"
      "tbz x27, #1, 30f\n"
      "ld1 { v31.h }[2], [x26], #0x2\n"
      "ld1 { v28.h }[2], [x25], #0x2\n"
      "tbz x27, #0, 33f\n"
      "ld1 { v31.b }[6], [x26]\n"
      "ld1 { v28.b }[6], [x25]\n"
      "b 33f\n"
      "30:"  // Height 2: Multiply loop: Ragged operand read: partial_1_4
      "tbz x27, #0, 33f\n"
      "ld1 { v31.b }[4], [x26]\n"
      "ld1 { v28.b }[4], [x25]\n"
      "b 33f\n"
      "31:"  // Height 2: Multiply loop: Ragged operand read: partial_2_0
      "tbz x27, #1, 32f\n"
      "ldr h31, [x26], #0x2\n"
      "ldr h28, [x25], #0x2\n"
      "tbz x27, #0, 33f\n"
      "ld1 { v31.b }[2], [x26]\n"
      "ld1 { v28.b }[2], [x25]\n"
      "b 33f\n"
      "32:"  // Height 2: Multiply loop: Ragged operand read: partial_1_0
      "ldr b31, [x26, #0x0]\n"
      "ldr b28, [x25, #0x0]\n"
      "33:"  // Height 2: Multiply loop: Ragged operand read: Done
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "34:"  // Height 2: Multiply loop: No odd multiplies
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 19b\n"
      "uadalp v0.4s, v1.8h\n"
      "uadalp v29.4s, v30.8h\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "addp v0.4s, v0.4s, v0.4s\n"
      "mul v0.4s, v0.4s, v2.4s\n"
      "str d0, [%x[out_ptr]], #0x8\n"
      "b 104f\n"
      "35:"  // Height 3
      "movi v1.8h, #0x0\n"
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "mov x9, #0x0\n"
      "movi v0.4s, #0x0\n"
      "mov x28, #0x0\n"
      "movi v30.8h, #0x0\n"
      "movi v29.4s, #0x0\n"
      "movi v27.8h, #0x0\n"
      "movi v26.4s, #0x0\n"
      "36:"  // Height 3: String loop
      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr w27, [x19, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 37f\n"
      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x19, x19, %x[input_offset], LSL #3\n"
      "ldr x26, [x19, #0x0]\n"
      "ldr x25, [x19, #0x8]\n"
      "ldr x24, [x19, #0x10]\n"
      "cbnz x28, 38f\n"
      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x19\n"
      "add x25, x25, x19\n"
      "add x24, x24, x19\n"
      "b 38f\n"
      "37:"  // Height 3: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, %x[input_offset]\n"
      "add x24, x25, %x[input_offset]\n"
      "38:"  // Height 3: input setup done
      "cmp x27, #0x10\n"
      "blt 42f\n"
      "cmp x27, #0x20\n"
      "blt 41f\n"
      "39:"  // Height 3: Multiply loop: Main loop head
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "cmp x9, #0x7e\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "blt 40f\n"
      "uadalp v0.4s, v1.8h\n"
      "movi v1.8h, #0x0\n"
      "uadalp v29.4s, v30.8h\n"
      "movi v30.8h, #0x0\n"
      "uadalp v26.4s, v27.8h\n"
      "movi v27.8h, #0x0\n"
      "mov x9, #0x0\n"
      "40:"  // Height 3: Multiply loop: unique 3: no collapse
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "add x9, x9, #0x1\n"
      "sub x27, x27, #0x10\n"
      "cmp x27, #0x20\n"
      "bge 39b\n"
      "41:"  // Height 3: Multiply loop: Single iteration only
      "sub x27, x27, #0x10\n"
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "add x24, x24, #0x10\n"
      "42:"  // Height 3: Multiply loop: Main loop skip
      "cbz x27, 51f\n"
      "tbz x27, #3, 46f\n"
      "ldr d31, [x26], #0x8\n"
      "ldr d28, [x25], #0x8\n"
      "ldr d25, [x24], #0x8\n"
      "tbz x27, #2, 44f\n"
      "ld1 { v31.s }[2], [x26], #0x4\n"
      "ld1 { v28.s }[2], [x25], #0x4\n"
      "ld1 { v25.s }[2], [x24], #0x4\n"
      "tbz x27, #1, 43f\n"
      "ld1 { v31.h }[6], [x26], #0x2\n"
      "ld1 { v28.h }[6], [x25], #0x2\n"
      "ld1 { v25.h }[6], [x24], #0x2\n"
      "tbz x27, #0, 50f\n"
      "ld1 { v31.b }[14], [x26]\n"
      "ld1 { v28.b }[14], [x25]\n"
      "ld1 { v25.b }[14], [x24]\n"
      "b 50f\n"
      "43:"  // Height 3: Multiply loop: Ragged operand read: partial_1_12
      "tbz x27, #0, 50f\n"
      "ld1 { v31.b }[12], [x26]\n"
      "ld1 { v28.b }[12], [x25]\n"
      "ld1 { v25.b }[12], [x24]\n"
      "b 50f\n"
      "44:"  // Height 3: Multiply loop: Ragged operand read: partial_2_8
      "tbz x27, #1, 45f\n"
      "ld1 { v31.h }[4], [x26], #0x2\n"
      "ld1 { v28.h }[4], [x25], #0x2\n"
      "ld1 { v25.h }[4], [x24], #0x2\n"
      "tbz x27, #0, 50f\n"
      "ld1 { v31.b }[10], [x26]\n"
      "ld1 { v28.b }[10], [x25]\n"
      "ld1 { v25.b }[10], [x24]\n"
      "b 50f\n"
      "45:"  // Height 3: Multiply loop: Ragged operand read: partial_1_8
      "tbz x27, #0, 50f\n"
      "ld1 { v31.b }[8], [x26]\n"
      "ld1 { v28.b }[8], [x25]\n"
      "ld1 { v25.b }[8], [x24]\n"
      "b 50f\n"
      "46:"  // Height 3: Multiply loop: Ragged operand read: partial_4_0
      "tbz x27, #2, 48f\n"
      "ldr s31, [x26], #0x4\n"
      "ldr s28, [x25], #0x4\n"
      "ldr s25, [x24], #0x4\n"
      "tbz x27, #1, 47f\n"
      "ld1 { v31.h }[2], [x26], #0x2\n"
      "ld1 { v28.h }[2], [x25], #0x2\n"
      "ld1 { v25.h }[2], [x24], #0x2\n"
      "tbz x27, #0, 50f\n"
      "ld1 { v31.b }[6], [x26]\n"
      "ld1 { v28.b }[6], [x25]\n"
      "ld1 { v25.b }[6], [x24]\n"
      "b 50f\n"
      "47:"  // Height 3: Multiply loop: Ragged operand read: partial_1_4
      "tbz x27, #0, 50f\n"
      "ld1 { v31.b }[4], [x26]\n"
      "ld1 { v28.b }[4], [x25]\n"
      "ld1 { v25.b }[4], [x24]\n"
      "b 50f\n"
      "48:"  // Height 3: Multiply loop: Ragged operand read: partial_2_0
      "tbz x27, #1, 49f\n"
      "ldr h31, [x26], #0x2\n"
      "ldr h28, [x25], #0x2\n"
      "ldr h25, [x24], #0x2\n"
      "tbz x27, #0, 50f\n"
      "ld1 { v31.b }[2], [x26]\n"
      "ld1 { v28.b }[2], [x25]\n"
      "ld1 { v25.b }[2], [x24]\n"
      "b 50f\n"
      "49:"  // Height 3: Multiply loop: Ragged operand read: partial_1_0
      "ldr b31, [x26, #0x0]\n"
      "ldr b28, [x25, #0x0]\n"
      "ldr b25, [x24, #0x0]\n"
      "50:"  // Height 3: Multiply loop: Ragged operand read: Done
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "51:"  // Height 3: Multiply loop: No odd multiplies
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 36b\n"
      "uadalp v0.4s, v1.8h\n"
      "uadalp v29.4s, v30.8h\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "uadalp v26.4s, v27.8h\n"
      "addp v0.4s, v0.4s, v0.4s\n"
      "addp v26.4s, v26.4s, v26.4s\n"
      "mul v0.4s, v0.4s, v2.4s\n"
      "str d0, [%x[out_ptr]], #0x8\n"
      "addp v26.4s, v26.4s, v26.4s\n"
      "mul v26.4s, v26.4s, v2.4s\n"
      "str s26, [%x[out_ptr]], #0x4\n"
      "b 104f\n"
      "52:"  // Height 4
      "movi v1.8h, #0x0\n"
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "mov x9, #0x0\n"
      "movi v0.4s, #0x0\n"
      "mov x28, #0x0\n"
      "movi v30.8h, #0x0\n"
      "movi v29.4s, #0x0\n"
      "movi v27.8h, #0x0\n"
      "movi v26.4s, #0x0\n"
      "movi v24.8h, #0x0\n"
      "movi v23.4s, #0x0\n"
      "53:"  // Height 4: String loop
      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr w27, [x19, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 54f\n"
      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x19, x19, %x[input_offset], LSL #3\n"
      "ldr x26, [x19, #0x0]\n"
      "ldr x25, [x19, #0x8]\n"
      "ldr x24, [x19, #0x10]\n"
      "ldr x23, [x19, #0x18]\n"
      "cbnz x28, 55f\n"
      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x19\n"
      "add x25, x25, x19\n"
      "add x24, x24, x19\n"
      "add x23, x23, x19\n"
      "b 55f\n"
      "54:"  // Height 4: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, %x[input_offset]\n"
      "add x24, x25, %x[input_offset]\n"
      "add x23, x24, %x[input_offset]\n"
      "55:"  // Height 4: input setup done
      "cmp x27, #0x10\n"
      "blt 59f\n"
      "cmp x27, #0x20\n"
      "blt 58f\n"
      "56:"  // Height 4: Multiply loop: Main loop head
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "ldr q22, [x23, #0x0]\n"
      "cmp x9, #0x7e\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "blt 57f\n"
      "uadalp v0.4s, v1.8h\n"
      "movi v1.8h, #0x0\n"
      "uadalp v29.4s, v30.8h\n"
      "movi v30.8h, #0x0\n"
      "uadalp v26.4s, v27.8h\n"
      "movi v27.8h, #0x0\n"
      "uadalp v23.4s, v24.8h\n"
      "movi v24.8h, #0x0\n"
      "mov x9, #0x0\n"
      "57:"  // Height 4: Multiply loop: unique 4: no collapse
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "add x9, x9, #0x1\n"
      "sub x27, x27, #0x10\n"
      "cmp x27, #0x20\n"
      "bge 56b\n"
      "58:"  // Height 4: Multiply loop: Single iteration only
      "sub x27, x27, #0x10\n"
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "ldr q22, [x23, #0x0]\n"
      "add x26, x26, #0x10\n"
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "59:"  // Height 4: Multiply loop: Main loop skip
      "cbz x27, 68f\n"
      "tbz x27, #3, 63f\n"
      "ldr d31, [x26], #0x8\n"
      "ldr d28, [x25], #0x8\n"
      "ldr d25, [x24], #0x8\n"
      "ldr d22, [x23], #0x8\n"
      "tbz x27, #2, 61f\n"
      "ld1 { v31.s }[2], [x26], #0x4\n"
      "ld1 { v28.s }[2], [x25], #0x4\n"
      "ld1 { v25.s }[2], [x24], #0x4\n"
      "ld1 { v22.s }[2], [x23], #0x4\n"
      "tbz x27, #1, 60f\n"
      "ld1 { v31.h }[6], [x26], #0x2\n"
      "ld1 { v28.h }[6], [x25], #0x2\n"
      "ld1 { v25.h }[6], [x24], #0x2\n"
      "ld1 { v22.h }[6], [x23], #0x2\n"
      "tbz x27, #0, 67f\n"
      "ld1 { v31.b }[14], [x26]\n"
      "ld1 { v28.b }[14], [x25]\n"
      "ld1 { v25.b }[14], [x24]\n"
      "ld1 { v22.b }[14], [x23]\n"
      "b 67f\n"
      "60:"  // Height 4: Multiply loop: Ragged operand read: partial_1_12
      "tbz x27, #0, 67f\n"
      "ld1 { v31.b }[12], [x26]\n"
      "ld1 { v28.b }[12], [x25]\n"
      "ld1 { v25.b }[12], [x24]\n"
      "ld1 { v22.b }[12], [x23]\n"
      "b 67f\n"
      "61:"  // Height 4: Multiply loop: Ragged operand read: partial_2_8
      "tbz x27, #1, 62f\n"
      "ld1 { v31.h }[4], [x26], #0x2\n"
      "ld1 { v28.h }[4], [x25], #0x2\n"
      "ld1 { v25.h }[4], [x24], #0x2\n"
      "ld1 { v22.h }[4], [x23], #0x2\n"
      "tbz x27, #0, 67f\n"
      "ld1 { v31.b }[10], [x26]\n"
      "ld1 { v28.b }[10], [x25]\n"
      "ld1 { v25.b }[10], [x24]\n"
      "ld1 { v22.b }[10], [x23]\n"
      "b 67f\n"
      "62:"  // Height 4: Multiply loop: Ragged operand read: partial_1_8
      "tbz x27, #0, 67f\n"
      "ld1 { v31.b }[8], [x26]\n"
      "ld1 { v28.b }[8], [x25]\n"
      "ld1 { v25.b }[8], [x24]\n"
      "ld1 { v22.b }[8], [x23]\n"
      "b 67f\n"
      "63:"  // Height 4: Multiply loop: Ragged operand read: partial_4_0
      "tbz x27, #2, 65f\n"
      "ldr s31, [x26], #0x4\n"
      "ldr s28, [x25], #0x4\n"
      "ldr s25, [x24], #0x4\n"
      "ldr s22, [x23], #0x4\n"
      "tbz x27, #1, 64f\n"
      "ld1 { v31.h }[2], [x26], #0x2\n"
      "ld1 { v28.h }[2], [x25], #0x2\n"
      "ld1 { v25.h }[2], [x24], #0x2\n"
      "ld1 { v22.h }[2], [x23], #0x2\n"
      "tbz x27, #0, 67f\n"
      "ld1 { v31.b }[6], [x26]\n"
      "ld1 { v28.b }[6], [x25]\n"
      "ld1 { v25.b }[6], [x24]\n"
      "ld1 { v22.b }[6], [x23]\n"
      "b 67f\n"
      "64:"  // Height 4: Multiply loop: Ragged operand read: partial_1_4
      "tbz x27, #0, 67f\n"
      "ld1 { v31.b }[4], [x26]\n"
      "ld1 { v28.b }[4], [x25]\n"
      "ld1 { v25.b }[4], [x24]\n"
      "ld1 { v22.b }[4], [x23]\n"
      "b 67f\n"
      "65:"  // Height 4: Multiply loop: Ragged operand read: partial_2_0
      "tbz x27, #1, 66f\n"
      "ldr h31, [x26], #0x2\n"
      "ldr h28, [x25], #0x2\n"
      "ldr h25, [x24], #0x2\n"
      "ldr h22, [x23], #0x2\n"
      "tbz x27, #0, 67f\n"
      "ld1 { v31.b }[2], [x26]\n"
      "ld1 { v28.b }[2], [x25]\n"
      "ld1 { v25.b }[2], [x24]\n"
      "ld1 { v22.b }[2], [x23]\n"
      "b 67f\n"
      "66:"  // Height 4: Multiply loop: Ragged operand read: partial_1_0
      "ldr b31, [x26, #0x0]\n"
      "ldr b28, [x25, #0x0]\n"
      "ldr b25, [x24, #0x0]\n"
      "ldr b22, [x23, #0x0]\n"
      "67:"  // Height 4: Multiply loop: Ragged operand read: Done
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "68:"  // Height 4: Multiply loop: No odd multiplies
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 53b\n"
      "uadalp v0.4s, v1.8h\n"
      "uadalp v29.4s, v30.8h\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "uadalp v26.4s, v27.8h\n"
      "uadalp v23.4s, v24.8h\n"
      "addp v29.4s, v26.4s, v23.4s\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "mul v0.4s, v0.4s, v2.4s\n"
      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
      "b 104f\n"
      "69:"  // Height 5
      "movi v1.8h, #0x0\n"
      "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "mov x9, #0x0\n"
      "movi v0.4s, #0x0\n"
      "mov x28, #0x0\n"
      "movi v30.8h, #0x0\n"
      "movi v29.4s, #0x0\n"
      "movi v27.8h, #0x0\n"
      "movi v26.4s, #0x0\n"
      "movi v24.8h, #0x0\n"
      "movi v23.4s, #0x0\n"
      "movi v21.8h, #0x0\n"
      "movi v20.4s, #0x0\n"
      "70:"  // Height 5: String loop
      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr w27, [x19, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 71f\n"
      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x19, x19, %x[input_offset], LSL #3\n"
      "ldr x26, [x19, #0x0]\n"
      "ldr x25, [x19, #0x8]\n"
      "ldr x24, [x19, #0x10]\n"
      "ldr x23, [x19, #0x18]\n"
      "ldr x22, [x19, #0x20]\n"
      "cbnz x28, 72f\n"
      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x19\n"
      "add x25, x25, x19\n"
      "add x24, x24, x19\n"
      "add x23, x23, x19\n"
      "add x22, x22, x19\n"
      "b 72f\n"
      "71:"  // Height 5: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, %x[input_offset]\n"
      "add x24, x25, %x[input_offset]\n"
      "add x23, x24, %x[input_offset]\n"
      "add x22, x23, %x[input_offset]\n"
      "72:"  // Height 5: input setup done
      "cmp x27, #0x10\n"
      "blt 76f\n"
      "cmp x27, #0x20\n"
      "blt 75f\n"
      "73:"  // Height 5: Multiply loop: Main loop head
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "ldr q22, [x23, #0x0]\n"
      "ldr q19, [x22, #0x0]\n"
      "cmp x9, #0x7e\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "add x22, x22, #0x10\n"
      "blt 74f\n"
      "uadalp v0.4s, v1.8h\n"
      "movi v1.8h, #0x0\n"
      "uadalp v29.4s, v30.8h\n"
      "movi v30.8h, #0x0\n"
      "uadalp v26.4s, v27.8h\n"
      "movi v27.8h, #0x0\n"
      "uadalp v23.4s, v24.8h\n"
      "movi v24.8h, #0x0\n"
      "uadalp v20.4s, v21.8h\n"
      "movi v21.8h, #0x0\n"
      "mov x9, #0x0\n"
      "74:"  // Height 5: Multiply loop: unique 5: no collapse
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "uadalp v21.8h, v19.16b\n"
      "add x9, x9, #0x1\n"
      "sub x27, x27, #0x10\n"
      "cmp x27, #0x20\n"
      "bge 73b\n"
      "75:"  // Height 5: Multiply loop: Single iteration only
      "sub x27, x27, #0x10\n"
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "ldr q22, [x23, #0x0]\n"
      "ldr q19, [x22, #0x0]\n"
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "uadalp v21.8h, v19.16b\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "add x22, x22, #0x10\n"
      "76:"  // Height 5: Multiply loop: Main loop skip
      "cbz x27, 85f\n"
      "tbz x27, #3, 80f\n"
      "ldr d31, [x26], #0x8\n"
      "ldr d28, [x25], #0x8\n"
      "ldr d25, [x24], #0x8\n"
      "ldr d22, [x23], #0x8\n"
      "ldr d19, [x22], #0x8\n"
      "tbz x27, #2, 78f\n"
      "ld1 { v31.s }[2], [x26], #0x4\n"
      "ld1 { v28.s }[2], [x25], #0x4\n"
      "ld1 { v25.s }[2], [x24], #0x4\n"
      "ld1 { v22.s }[2], [x23], #0x4\n"
      "ld1 { v19.s }[2], [x22], #0x4\n"
      "tbz x27, #1, 77f\n"
      "ld1 { v31.h }[6], [x26], #0x2\n"
      "ld1 { v28.h }[6], [x25], #0x2\n"
      "ld1 { v25.h }[6], [x24], #0x2\n"
      "ld1 { v22.h }[6], [x23], #0x2\n"
      "ld1 { v19.h }[6], [x22], #0x2\n"
      "tbz x27, #0, 84f\n"
      "ld1 { v31.b }[14], [x26]\n"
      "ld1 { v28.b }[14], [x25]\n"
      "ld1 { v25.b }[14], [x24]\n"
      "ld1 { v22.b }[14], [x23]\n"
      "ld1 { v19.b }[14], [x22]\n"
      "b 84f\n"
      "77:"  // Height 5: Multiply loop: Ragged operand read: partial_1_12
      "tbz x27, #0, 84f\n"
      "ld1 { v31.b }[12], [x26]\n"
      "ld1 { v28.b }[12], [x25]\n"
      "ld1 { v25.b }[12], [x24]\n"
      "ld1 { v22.b }[12], [x23]\n"
      "ld1 { v19.b }[12], [x22]\n"
      "b 84f\n"
      "78:"  // Height 5: Multiply loop: Ragged operand read: partial_2_8
      "tbz x27, #1, 79f\n"
      "ld1 { v31.h }[4], [x26], #0x2\n"
      "ld1 { v28.h }[4], [x25], #0x2\n"
      "ld1 { v25.h }[4], [x24], #0x2\n"
      "ld1 { v22.h }[4], [x23], #0x2\n"
      "ld1 { v19.h }[4], [x22], #0x2\n"
      "tbz x27, #0, 84f\n"
      "ld1 { v31.b }[10], [x26]\n"
      "ld1 { v28.b }[10], [x25]\n"
      "ld1 { v25.b }[10], [x24]\n"
      "ld1 { v22.b }[10], [x23]\n"
      "ld1 { v19.b }[10], [x22]\n"
      "b 84f\n"
      "79:"  // Height 5: Multiply loop: Ragged operand read: partial_1_8
      "tbz x27, #0, 84f\n"
      "ld1 { v31.b }[8], [x26]\n"
      "ld1 { v28.b }[8], [x25]\n"
      "ld1 { v25.b }[8], [x24]\n"
      "ld1 { v22.b }[8], [x23]\n"
      "ld1 { v19.b }[8], [x22]\n"
      "b 84f\n"
      "80:"  // Height 5: Multiply loop: Ragged operand read: partial_4_0
      "tbz x27, #2, 82f\n"
      "ldr s31, [x26], #0x4\n"
      "ldr s28, [x25], #0x4\n"
      "ldr s25, [x24], #0x4\n"
      "ldr s22, [x23], #0x4\n"
      "ldr s19, [x22], #0x4\n"
      "tbz x27, #1, 81f\n"
      "ld1 { v31.h }[2], [x26], #0x2\n"
      "ld1 { v28.h }[2], [x25], #0x2\n"
      "ld1 { v25.h }[2], [x24], #0x2\n"
      "ld1 { v22.h }[2], [x23], #0x2\n"
      "ld1 { v19.h }[2], [x22], #0x2\n"
      "tbz x27, #0, 84f\n"
      "ld1 { v31.b }[6], [x26]\n"
      "ld1 { v28.b }[6], [x25]\n"
      "ld1 { v25.b }[6], [x24]\n"
      "ld1 { v22.b }[6], [x23]\n"
      "ld1 { v19.b }[6], [x22]\n"
      "b 84f\n"
      "81:"  // Height 5: Multiply loop: Ragged operand read: partial_1_4
      "tbz x27, #0, 84f\n"
      "ld1 { v31.b }[4], [x26]\n"
      "ld1 { v28.b }[4], [x25]\n"
      "ld1 { v25.b }[4], [x24]\n"
      "ld1 { v22.b }[4], [x23]\n"
      "ld1 { v19.b }[4], [x22]\n"
      "b 84f\n"
      "82:"  // Height 5: Multiply loop: Ragged operand read: partial_2_0
      "tbz x27, #1, 83f\n"
      "ldr h31, [x26], #0x2\n"
      "ldr h28, [x25], #0x2\n"
      "ldr h25, [x24], #0x2\n"
      "ldr h22, [x23], #0x2\n"
      "ldr h19, [x22], #0x2\n"
      "tbz x27, #0, 84f\n"
      "ld1 { v31.b }[2], [x26]\n"
      "ld1 { v28.b }[2], [x25]\n"
      "ld1 { v25.b }[2], [x24]\n"
      "ld1 { v22.b }[2], [x23]\n"
      "ld1 { v19.b }[2], [x22]\n"
      "b 84f\n"
      "83:"  // Height 5: Multiply loop: Ragged operand read: partial_1_0
      "ldr b31, [x26, #0x0]\n"
      "ldr b28, [x25, #0x0]\n"
      "ldr b25, [x24, #0x0]\n"
      "ldr b22, [x23, #0x0]\n"
      "ldr b19, [x22, #0x0]\n"
      "84:"  // Height 5: Multiply loop: Ragged operand read: Done
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "uadalp v21.8h, v19.16b\n"
      "85:"  // Height 5: Multiply loop: No odd multiplies
      "add x28, x28, #0x1\n"
      "cmp x28, x20\n"
      "bne 70b\n"
      "uadalp v0.4s, v1.8h\n"
      "uadalp v29.4s, v30.8h\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "uadalp v26.4s, v27.8h\n"
      "uadalp v23.4s, v24.8h\n"
      "addp v29.4s, v26.4s, v23.4s\n"
      "uadalp v20.4s, v21.8h\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "addp v20.4s, v20.4s, v20.4s\n"
      "mul v0.4s, v0.4s, v2.4s\n"
      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
      "addp v20.4s, v20.4s, v20.4s\n"
      "mul v20.4s, v20.4s, v2.4s\n"
      "str s20, [%x[out_ptr]], #0x4\n"
      "b 104f\n"
      "86:"  // Height 6
      "movi v1.8h, #0x0\n"
      "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
      "mov x9, #0x0\n"
      "movi v0.4s, #0x0\n"
      "mov x28, #0x0\n"
      "movi v30.8h, #0x0\n"
      "movi v29.4s, #0x0\n"
      "movi v27.8h, #0x0\n"
      "movi v26.4s, #0x0\n"
      "movi v24.8h, #0x0\n"
      "movi v23.4s, #0x0\n"
      "movi v21.8h, #0x0\n"
      "movi v20.4s, #0x0\n"
      "movi v18.8h, #0x0\n"
      "movi v17.4s, #0x0\n"
      "87:"  // Height 6: String loop
      "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
      "ldr w27, [x19, x28, LSL #0x2]\n"
      "tbz %x[flags], #3, 88f\n"
      "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
      "add x19, x19, %x[input_offset], LSL #3\n"
      "ldr x26, [x19, #0x0]\n"
      "ldr x25, [x19, #0x8]\n"
      "ldr x24, [x19, #0x10]\n"
      "ldr x23, [x19, #0x18]\n"
      "ldr x22, [x19, #0x20]\n"
      "ldr x20, [x19, #0x28]\n"
      "cbnz x28, 89f\n"
      "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
      "add x26, x26, x19\n"
      "add x25, x25, x19\n"
      "add x24, x24, x19\n"
      "add x23, x23, x19\n"
      "add x22, x22, x19\n"
      "add x20, x20, x19\n"
      "b 89f\n"
      "88:"  // Height 6: setup direct input
      "mov x26, %x[input_ptr]\n"
      "add x25, x26, %x[input_offset]\n"
      "add x24, x25, %x[input_offset]\n"
      "add x23, x24, %x[input_offset]\n"
      "add x22, x23, %x[input_offset]\n"
      "add x20, x22, %x[input_offset]\n"
      "89:"  // Height 6: input setup done
      "cmp x27, #0x10\n"
      "blt 93f\n"
      "cmp x27, #0x20\n"
      "blt 92f\n"
      "90:"  // Height 6: Multiply loop: Main loop head
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "ldr q22, [x23, #0x0]\n"
      "ldr q19, [x22, #0x0]\n"
      "ldr q16, [x20, #0x0]\n"
      "cmp x9, #0x7e\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "add x22, x22, #0x10\n"
      "add x20, x20, #0x10\n"
      "blt 91f\n"
      "uadalp v0.4s, v1.8h\n"
      "movi v1.8h, #0x0\n"
      "uadalp v29.4s, v30.8h\n"
      "movi v30.8h, #0x0\n"
      "uadalp v26.4s, v27.8h\n"
      "movi v27.8h, #0x0\n"
      "uadalp v23.4s, v24.8h\n"
      "movi v24.8h, #0x0\n"
      "uadalp v20.4s, v21.8h\n"
      "movi v21.8h, #0x0\n"
      "uadalp v17.4s, v18.8h\n"
      "movi v18.8h, #0x0\n"
      "mov x9, #0x0\n"
      "91:"  // Height 6: Multiply loop: unique 6: no collapse
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "uadalp v21.8h, v19.16b\n"
      "uadalp v18.8h, v16.16b\n"
      "add x9, x9, #0x1\n"
      "sub x27, x27, #0x10\n"
      "cmp x27, #0x20\n"
      "bge 90b\n"
      "92:"  // Height 6: Multiply loop: Single iteration only
      "sub x27, x27, #0x10\n"
      "ldr q31, [x26, #0x0]\n"
      "ldr q28, [x25, #0x0]\n"
      "ldr q25, [x24, #0x0]\n"
      "ldr q22, [x23, #0x0]\n"
      "ldr q19, [x22, #0x0]\n"
      "ldr q16, [x20, #0x0]\n"
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "uadalp v21.8h, v19.16b\n"
      "uadalp v18.8h, v16.16b\n"
      "add x26, x26, #0x10\n"
      "add x25, x25, #0x10\n"
      "add x24, x24, #0x10\n"
      "add x23, x23, #0x10\n"
      "add x22, x22, #0x10\n"
      "add x20, x20, #0x10\n"
      "93:"  // Height 6: Multiply loop: Main loop skip
      "cbz x27, 102f\n"
      "tbz x27, #3, 97f\n"
      "ldr d31, [x26], #0x8\n"
      "ldr d28, [x25], #0x8\n"
      "ldr d25, [x24], #0x8\n"
      "ldr d22, [x23], #0x8\n"
      "ldr d19, [x22], #0x8\n"
      "ldr d16, [x20], #0x8\n"
      "tbz x27, #2, 95f\n"
      "ld1 { v31.s }[2], [x26], #0x4\n"
      "ld1 { v28.s }[2], [x25], #0x4\n"
      "ld1 { v25.s }[2], [x24], #0x4\n"
      "ld1 { v22.s }[2], [x23], #0x4\n"
      "ld1 { v19.s }[2], [x22], #0x4\n"
      "ld1 { v16.s }[2], [x20], #0x4\n"
      "tbz x27, #1, 94f\n"
      "ld1 { v31.h }[6], [x26], #0x2\n"
      "ld1 { v28.h }[6], [x25], #0x2\n"
      "ld1 { v25.h }[6], [x24], #0x2\n"
      "ld1 { v22.h }[6], [x23], #0x2\n"
      "ld1 { v19.h }[6], [x22], #0x2\n"
      "ld1 { v16.h }[6], [x20], #0x2\n"
      "tbz x27, #0, 101f\n"
      "ld1 { v31.b }[14], [x26]\n"
      "ld1 { v28.b }[14], [x25]\n"
      "ld1 { v25.b }[14], [x24]\n"
      "ld1 { v22.b }[14], [x23]\n"
      "ld1 { v19.b }[14], [x22]\n"
      "ld1 { v16.b }[14], [x20]\n"
      "b 101f\n"
      "94:"  // Height 6: Multiply loop: Ragged operand read: partial_1_12
      "tbz x27, #0, 101f\n"
      "ld1 { v31.b }[12], [x26]\n"
      "ld1 { v28.b }[12], [x25]\n"
      "ld1 { v25.b }[12], [x24]\n"
      "ld1 { v22.b }[12], [x23]\n"
      "ld1 { v19.b }[12], [x22]\n"
      "ld1 { v16.b }[12], [x20]\n"
      "b 101f\n"
      "95:"  // Height 6: Multiply loop: Ragged operand read: partial_2_8
      "tbz x27, #1, 96f\n"
      "ld1 { v31.h }[4], [x26], #0x2\n"
      "ld1 { v28.h }[4], [x25], #0x2\n"
      "ld1 { v25.h }[4], [x24], #0x2\n"
      "ld1 { v22.h }[4], [x23], #0x2\n"
      "ld1 { v19.h }[4], [x22], #0x2\n"
      "ld1 { v16.h }[4], [x20], #0x2\n"
      "tbz x27, #0, 101f\n"
      "ld1 { v31.b }[10], [x26]\n"
      "ld1 { v28.b }[10], [x25]\n"
      "ld1 { v25.b }[10], [x24]\n"
      "ld1 { v22.b }[10], [x23]\n"
      "ld1 { v19.b }[10], [x22]\n"
      "ld1 { v16.b }[10], [x20]\n"
      "b 101f\n"
      "96:"  // Height 6: Multiply loop: Ragged operand read: partial_1_8
      "tbz x27, #0, 101f\n"
      "ld1 { v31.b }[8], [x26]\n"
      "ld1 { v28.b }[8], [x25]\n"
      "ld1 { v25.b }[8], [x24]\n"
      "ld1 { v22.b }[8], [x23]\n"
      "ld1 { v19.b }[8], [x22]\n"
      "ld1 { v16.b }[8], [x20]\n"
      "b 101f\n"
      "97:"  // Height 6: Multiply loop: Ragged operand read: partial_4_0
      "tbz x27, #2, 99f\n"
      "ldr s31, [x26], #0x4\n"
      "ldr s28, [x25], #0x4\n"
      "ldr s25, [x24], #0x4\n"
      "ldr s22, [x23], #0x4\n"
      "ldr s19, [x22], #0x4\n"
      "ldr s16, [x20], #0x4\n"
      "tbz x27, #1, 98f\n"
      "ld1 { v31.h }[2], [x26], #0x2\n"
      "ld1 { v28.h }[2], [x25], #0x2\n"
      "ld1 { v25.h }[2], [x24], #0x2\n"
      "ld1 { v22.h }[2], [x23], #0x2\n"
      "ld1 { v19.h }[2], [x22], #0x2\n"
      "ld1 { v16.h }[2], [x20], #0x2\n"
      "tbz x27, #0, 101f\n"
      "ld1 { v31.b }[6], [x26]\n"
      "ld1 { v28.b }[6], [x25]\n"
      "ld1 { v25.b }[6], [x24]\n"
      "ld1 { v22.b }[6], [x23]\n"
      "ld1 { v19.b }[6], [x22]\n"
      "ld1 { v16.b }[6], [x20]\n"
      "b 101f\n"
      "98:"  // Height 6: Multiply loop: Ragged operand read: partial_1_4
      "tbz x27, #0, 101f\n"
      "ld1 { v31.b }[4], [x26]\n"
      "ld1 { v28.b }[4], [x25]\n"
      "ld1 { v25.b }[4], [x24]\n"
      "ld1 { v22.b }[4], [x23]\n"
      "ld1 { v19.b }[4], [x22]\n"
      "ld1 { v16.b }[4], [x20]\n"
      "b 101f\n"
      "99:"  // Height 6: Multiply loop: Ragged operand read: partial_2_0
      "tbz x27, #1, 100f\n"
      "ldr h31, [x26], #0x2\n"
      "ldr h28, [x25], #0x2\n"
      "ldr h25, [x24], #0x2\n"
      "ldr h22, [x23], #0x2\n"
      "ldr h19, [x22], #0x2\n"
      "ldr h16, [x20], #0x2\n"
      "tbz x27, #0, 101f\n"
      "ld1 { v31.b }[2], [x26]\n"
      "ld1 { v28.b }[2], [x25]\n"
      "ld1 { v25.b }[2], [x24]\n"
      "ld1 { v22.b }[2], [x23]\n"
      "ld1 { v19.b }[2], [x22]\n"
      "ld1 { v16.b }[2], [x20]\n"
      "b 101f\n"
      "100:"  // Height 6: Multiply loop: Ragged operand read: partial_1_0
      "ldr b31, [x26, #0x0]\n"
      "ldr b28, [x25, #0x0]\n"
      "ldr b25, [x24, #0x0]\n"
      "ldr b22, [x23, #0x0]\n"
      "ldr b19, [x22, #0x0]\n"
      "ldr b16, [x20, #0x0]\n"
      "101:"  // Height 6: Multiply loop: Ragged operand read: Done
      "uadalp v1.8h, v31.16b\n"
      "uadalp v30.8h, v28.16b\n"
      "uadalp v27.8h, v25.16b\n"
      "uadalp v24.8h, v22.16b\n"
      "uadalp v21.8h, v19.16b\n"
      "uadalp v18.8h, v16.16b\n"
      "102:"  // Height 6: Multiply loop: No odd multiplies
      "add x28, x28, #0x1\n"
      "cmp x28, x21\n"
      "bne 87b\n"
      "uadalp v0.4s, v1.8h\n"
      "uadalp v29.4s, v30.8h\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "uadalp v26.4s, v27.8h\n"
      "uadalp v23.4s, v24.8h\n"
      "addp v29.4s, v26.4s, v23.4s\n"
      "uadalp v20.4s, v21.8h\n"
      "uadalp v17.4s, v18.8h\n"
      "addp v0.4s, v0.4s, v29.4s\n"
      "subs %x[M], %x[M], #0x6\n"
      "addp v20.4s, v20.4s, v17.4s\n"
      "mul v0.4s, v0.4s, v2.4s\n"
      "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
      "addp v20.4s, v20.4s, v20.4s\n"
      "mul v20.4s, v20.4s, v2.4s\n"
      "str d20, [%x[out_ptr]], #0x8\n"
      "beq 104f\n"
      "tbz %x[flags], #3, 103f\n"
      "add %x[input_offset], %x[input_offset], #0x6\n"
      "b 1b\n"
      "103:"  // Update direct input
      "mov x19, #0x6\n"
      "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
      "b 1b\n"
      "104:"  // Exit

      : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
      : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
      : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
    );
}

} // namespace arm_gemm

#endif // __aarch64__
